---
title: "Dinger, Stein, Zhao - Replication assignment code"
output: 
  pdf_document: default
  html_document: default
header-includes:
  - \usepackage[labelformat = empty]{caption}
  - \usepackage[textfont = bf]{caption}
fig_width: 3
fig_height: 2
urlcolor: blue
---

# Introduction

This code was created by Tai Dinger, Dorit Stein, and Lavinia Zhao as part of the replication assignment in Gary King's Gov 2001 class, Fall 2022 at Harvard University. We replicated the paper "Solís Arce JS, Warren SS, Meriggi NF, Scacco A, McMurry N, Voors M, Syunyaev G, Malik AA, Aboutajdine S, Adeojo O, Anigo D. COVID-19 vaccine acceptance and hesitancy in low-and middle-income countries. Nature medicine. 2021 Aug;27(8):1385-94"

Below code is a combination of code from Solis et al.'s replication files ("original analysis") and our own extensions ("extension analysis"). 

##--------------------------------
# SET-UP CODE
##--------------------------------

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r include = F}
if (!require(pacman)) install.packages("pacman")

pacman::p_load(tidyverse, magrittr, knitr, kableExtra, DT, lazyeval, 
               labelled, forcats, readxl, googledrive, estimatr, ggforce, fastDummies,
               stringr, RColorBrewer, readstata13, metaplus, sjlabelled, ggrepel, tikzDevice,
               lfe, stats, startgazer,pROC,dotwhisker)

if (!require(devtools)) install.packages("devtools"); library(devtools)

install_github("IQSS/RobustSE")
library(RobustSE)

rm(list=ls()) # removes all objects from the environment
cat('\014') # clears the console
```

##--------------------------------
# FUNCTIONS FROM ORIGINAL ANALYSIS
##--------------------------------

# The helper renormalizes weights so that each study gets the 
# same total weight even if they are missing data

```{r}
study_weighting <- function(data)
    data %>% 
    dplyr::group_by(country) %>% 
    dplyr::mutate(weight = weight/sum(weight)) %>% 
    dplyr::ungroup() 

lm_helper <- function(data,
                      ...) {
    data %>% 
        study_weighting() %>% 
        estimatr::lm_robust(data = .,...) %>% 
        {bind_cols( tidy(.), n = nobs(.) )}
}

# Leave-X-out helper that takes data and sample_var, 
# nests by sample_var performs LOO with loo_n observations out
# applying loo_fun to each sample

loo_helper <- function(data, 
                       sample_var, 
                       loo_n = 1,
                       loo_fun = 
                           function(dat) 
                               lm_helper(data = dat, 
                                         formula = take_vaccine_num ~ 1, 
                                         cluster = cluster,
                                         weight = weight, 
                                         se_type = "stata")) {
    
    .var <- data[[sample_var]]
    
    # Data is recombined and loo_fun applied to each combination 
    data %>% 
        {
            plyr::adply(.data = combn(unique(.var), loo_n), 
                        .margins = 2, 
                        .fun = function(x) loo_fun(.[!(.var %in% x), ]) )
        }
}

# # Illustration of loo_helper
# loo_helper(
#   data = data.frame(country = 1:3, Y = 1:3),
#   loo_n = 2,
#   sample_var = "country",
#   loo_fun = function(dat) mean(dat$Y)
# )

# Subgroup analysis : Function to apply analysis function over groups

grp_analysis <- function(df, 
                         y, 
                         x) {
    df %>%
        dplyr::filter(if_all(c(all_of(x), all_of(y), cluster, weight), ~ !is.na(.))) %>%
        dplyr::nest_by(group, get(x)) %>%
        dplyr::summarize(
            lm_helper(data = data, 
                      formula = as.formula(paste0(y, "~ 1")), cluster = cluster,
                      weight = weight, se_type = "stata"), .groups = "drop") %>% 
        dplyr::rename(!!x := "get(x)")
}

# Reasons analysis: Function to apply analysis function over groups

reasons_together <- function(df, 
                             reason, 
                             num = "Yes") {
    df %>%
        dplyr::filter(take_vaccine %in% num, 
                      if_all(c(all_of(reason), cluster, weight), ~ !is.na(.))) %>%
        dplyr::nest_by(group) %>%
        dplyr::summarize(
            lm_helper(data = data, 
                      formula = as.formula(paste0(reason, "~ 1")), 
                      cluster = cluster,
                      weight = weight, se_type = "stata"), .groups = "drop")
}


reasons_together_subgroup <- function(df, 
                                      reason, num = "Yes", 
                                      dem_group = NA, 
                                      dem_subgroup = NA) {
    
    if (dem_group == "gender")
        df <- filter(df, gender %in% dem_subgroup)
    
    df %>%
        dplyr::filter(take_vaccine %in% num,
                      !is.na(get(reason))) %>%
        dplyr::nest_by(group) %>%
        dplyr::summarize(
            lm_helper(data = data, 
                      formula = as.formula(paste0(reason, "~ 1")), cluster = cluster,
                      weight = weight, se_type = "stata"), .groups = "drop")
}

# Age analysis for reasons

age_analysis <- function(df, 
                         reason, 
                         num = "Yes", 
                         filter_by = NA) {
    df %>%
        dplyr::filter({{filter_by}} == 1)  %>%
        dplyr::filter(take_vaccine %in% num, 
                      if_all(c(all_of(reason), cluster, weight), ~ !is.na(.))) %>%
        dplyr::nest_by(group) %>%
        dplyr::summarize(
            lm_helper(data = data, 
                      formula = as.formula(paste0(reason, "~ 1")), 
                      cluster = cluster,
                      weight = weight, se_type = "stata"), .groups = "drop")
}
```

##--------------------------------
# IMPORT AND CLEAN DATA (ORIGINAL ANALYSIS)
##--------------------------------

```{r}
# Call data created in 1_cleaning.Rmd
  # data from Dataverse - "combined.csv"
df <- read.csv("~/Dropbox/GOV2001 Replication Paper/Extensions/combined.csv")

# If no cluster information given for a study then individuals are clusters 
# Ensure cluster ids are distinct across studies
df <- 
    df %>% 
    dplyr::group_by(study) %>% 
    dplyr::mutate(
        cluster = ifelse(is.na(cluster), paste(1:n()), cluster),
        cluster = paste0(gsub(" ", "_", tolower(country)), "_", cluster))


# Weights sum to 1 in each study and recode age and education into bins
df <- 
    df %>% 
    dplyr::group_by(study) %>% 
    dplyr::mutate(
        weight_replace = mean(weight, rm.na = TRUE),
        weight = if_else(is.na(weight), 
                         if_else(is.na(weight_replace), 1, weight_replace), 
                         weight),
        weight = weight/sum(weight)) %>% 
    dplyr::ungroup() %>%
    dplyr::mutate(
        age_groups = 
            as.character(cut(x = age, breaks = c(-Inf, 18, 30, 45, 60, +Inf), right = F)),
        age_groups_binary = ifelse(age >= 55, "55+", NA),
        age_groups_binary = ifelse(age < 55, "<55", age_groups_binary),
        age_less24 = ifelse(age <= 24, 1, 0),
        age_25_54 = ifelse(age >= 25 & age <= 54, 1, 0),
        age_55_more = ifelse(age >= 55, 1, 0),
        age_groups_three = ifelse(age <= 24, "<25", NA),
        age_groups_three = ifelse(age >= 25 & age <= 54, "25-54", age_groups_three),
        age_groups_three = ifelse(age >= 55, "55+", age_groups_three),
        educ_binary = if_else(educ == "More than secondary", "> Secondary", "Up to Secondary")) 


# We create a new dataframe with countries and with "All" (only LMICs). Countries are clusters in "All" analysis
# USA and Russia excluded from "All" set

df2 <- 
    dplyr::bind_rows(
        mutate(df, group = country),
        mutate(filter(df, country != "USA" & country != "Russia"), group = "All")) %>% 
    mutate(
        cluster = if_else(group == "All", 
                          gsub(pattern = " ", replacement = "_", x = tolower(country)), 
                          cluster)) %>%
  mutate(vac_outcome = ifelse(take_vaccine == "Yes", 1, 0)) %>%
  mutate(gender_coded = ifelse(gender == 'Male',1,0))

```

##--------------------------------
# MODELS (ORIGINAL ANALYSIS)
##--------------------------------

```{r}

# Data for figure 1 in the paper

# weighted average by group (gaussian)
df2 %>%
    nest_by(group) %>%
    summarize(
        lm_robust(formula = take_vaccine_num ~ 1, data = data, 
                        weights = weight, clusters = cluster, se_type = "stata") %>%
            {bind_cols( tidy(.), n = nobs(.) )}) %>%
  kable() %>% kable_material_dark()

```

##--------------------------------
# MODELS (EXTENSION ANALYSIS)
##--------------------------------

1. Choosing best fit models 

+ Comparing linear v logit, and with or without covariates
+ Look at average predicted probabilities, and ROC curves to choose best fit model

```{r}

# remove countries missing data for covariates
df3 <- df2 %>%
  filter(country != "Nepal") %>%
  filter(country != "Pakistan 2") %>%
  filter(country != "Nigeria") 

df3 <- df3 %>%
  mutate(educ_binary2 = ifelse(educ_binary == "> Secondary", 1, 0))

# Model fitting
# comparing gaussian to logit regressions of survey ~ age, educ, and gender

# linear regressions
model_lm0 <- glm(vac_outcome ~ I(country),
                 data = df3, 
                 family = 'gaussian')
model_lm1 <- glm(vac_outcome ~ age + factor(educ_binary2) + factor(gender) + I(country), 
                data = df3, 
                family = 'gaussian')


# logit models
model_logit0 <- glm(vac_outcome ~ I(country),
                    data = df3,
                    family = binomial(link = 'logit'))

model_logit1 <- glm(vac_outcome ~ age + factor(educ_binary2) + factor(gender) + I(country), 
                   data = df3, 
                   family = binomial(link = 'logit'))

model_logit2 <- glm(vac_outcome ~ age + factor(educ_binary2) + factor(gender), 
                   data = df3, 
                   family = binomial(link = 'logit'))


# avg prob of uptake
# average predicted probability of uptake, by country

df3$logit0 <- predict.glm(model_logit0, 
              newdata=df3,
  type = 'response')

df3$logit1 <- predict.glm(model_logit1, 
              newdata=df3,
  type = 'response')

df3$logit2 <- predict.glm(model_logit2, 
              newdata=df3,
  type = 'response')



df3$linear0 = predict(model_lm0, 
              newdata=df3,
  type = 'response')

df3$linear1 = predict(model_lm1, 
              newdata=df3,
  type = 'response')

library(stargazer)

stargazer(model_lm0, model_lm1, type = 'text')
stargazer(model_logit0, model_logit1, type = 'text')

# average predicted probabilities by model
sum_logit <- df3 %>%
  group_by(country) %>%
  summarize(prob_uptake0 = mean(logit0, na.rm = T),
            prob_line0 = mean(linear0, na.rm = T),
            prob_uptake1 = mean(logit1, na.rm = T),
            prob_line1 = mean(linear1, na.rm = T)) %>%
  filter(country != "Nepal") %>%
  filter(country != "Pakistan 2") %>%
  filter(country != "Nigeria") %>%
  pivot_longer(cols = prob_uptake0:prob_line1, values_to = "prob", names_to = "model_type") %>%
  mutate(type = case_when(model_type == "prob_line1" | 
                            model_type == "prob_line0" | 
                            model_type == "prob_line2" ~ "linear",
                          TRUE ~ "logistic"),
         model_name = case_when(model_type == "prob_line1"  ~ "Full linear - age, educ, gender, country FE", 
                            model_type == "prob_line0" ~ "Null linear - country FE", 
                            model_type == "prob_uptake0" ~ "Null logit - country FE",
                            model_type == "prob_uptake1" ~ "Full logit - age, educ, gender, country FE",
                          TRUE ~ "NA"))

# plot avg predicted probabilities by country and model
ggplot(data = sum_logit, aes(x = prob, y = country, group = model_name, color = model_name)) +
  geom_point() +
  facet_wrap(~type) +
  theme_minimal() +
  theme(legend.position = "right")

ggsave("figure-model-fit.eps")

# ROC curves to assess model fit

# LOGITS
# country fixed effects only
logit_null <- roc(df3$vac_outcome ~ df3$logit0, plot = T, print.auc = T)
logit_null_auc <- logit_null$auc
# age + factor(educ) + gender + I(country)
logit_full <- roc(df3$vac_outcome ~ df3$logit1, plot = T, print.auc = T)
logit_full_auc <- logit_full$auc

# LINEAR
# country fixed effects only
linear_null <- roc(df3$vac_outcome ~ df3$linear0, plot = T, print.auc = T)
linear_null_auc <- linear_null$auc
# age + factor(educ) + gender
linear_full <- roc(df3$vac_outcome ~ df3$linear1, plot = T, print.auc = T)
linear_full_auc <- linear_full$auc


auc <- tibble(AUC = c(linear_null_auc, linear_full_auc, logit_null_auc, logit_full_auc),
              Model = c("Null linear - country FE", "Full linear - age, educ, gender, country FE", "Null logit - country FE", "Full logit - age, educ, gender, country FE")) %>%
  select(Model, AUC) %>%
  mutate(AUC = round(AUC, 2))

# plot ROCs together

ggroc(list("Null linear - country FE" = linear_null, 
           "Full linear - age, educ, gender, country FE" = linear_full,
           "Null logit - country FE" = logit_null, 
           "Full logit - age, educ, gender, country FE" = logit_full)) +
  facet_wrap(~name) +
  theme_classic() +
  theme(legend.position = "") +
  ggtitle("ROCs")

ggsave("roc.eps")

saveRDS(auc, "auc-table.rds")

```


```{r eval = F}
# GIM 
# Our homoskedastic and HC standard errors are slightly closer when using a logit model

GIM(model_lm1, full = F)
GIM(model_logit1, full = F)

```


##----------------------------------------
# Functions for simulation (Extension analysis)
##----------------------------------------

```{r}
set.seed(2001)
# Functions needed to run simulations

# simulate the coefficients
sim_coeff <- function(coeff, vcov) {
  return(mvtnorm::rmvnorm(1, mean = coeff, sigma = vcov))
}
#coeff_sim <- sim_coeff(model_logit$coeff, vcov(model_logit))
# logistic function
invlogit <- function(x) {exp(x) / (1 + exp(x))}

# calculate the predicted probability
sim_E <- function(data_sim, coeff_sim) {
  Y <- invlogit(as.matrix(data_sim) %*% t(coeff_sim) )
  return(Y)
}

# simulate quantity of interest (predicted probability)
sim_run <- function(fit, data_sim) {
  # Step 1: Simulate coefficients
  coeff_sim <- sim_coeff(fit$coeff, vcov(fit))
  # Step 2 and Step 3: Simulate Y 
  # and calculate the QOI (fundamental uncertainty)
  Y <- sim_E(data_sim, coeff_sim)
  return(Y)
}
```

##----------------------------------------
# Simulations (extension analysis)
##----------------------------------------

Simulation 1: 

+ Varying levels of (binary) education, holding age at country-level means, by gender and country. 

```{r}

df3$educ_binary2 <- as.factor(df3$educ_binary2)
df3$country <- as.factor(df3$country)
df3$gender_coded <- as.factor(df3$gender_coded)

# Create the dataset, varying education
# get the means age and gender
sim_data <- df3 %>%
  group_by(country, gender_coded) %>%
  summarize(age = mean(age, na.rm = T)) %>%
  filter(!is.na(gender_coded)) %>%
  mutate('(Intercept)' = 1) %>%
  mutate(educ_binary2 = 0)

sim_data2 <- df3 %>%
  group_by(country, gender_coded) %>%
  summarize(age = mean(age, na.rm = T)) %>%
  filter(!is.na(gender_coded)) %>%
  mutate('(Intercept)' = 1) %>%
  mutate(educ_binary2 = 1)

# dataframe that fits the model output
sim_data3 <- rbind(sim_data, sim_data2) %>%
  ungroup() %>%
  mutate('I(country)Colombia' = ifelse(country == "Colombia", 1, 0),
         'I(country)India' = ifelse(country == "India", 1, 0),
         'I(country)Mozambique' = ifelse(country == "Mozambique", 1, 0),
         'I(country)Pakistan 1' = ifelse(country == "Pakistan 1", 1, 0),
         'I(country)Russia' = ifelse(country == "Russia", 1, 0),
         'I(country)Rwanda' = ifelse(country == "Rwanda", 1, 0),
         'I(country)Sierra Leone 1' = ifelse(country == "Sierra Leone 1", 1, 0),
         'I(country)Sierra Leone 2' = ifelse(country == "Sierra Leone 2", 1, 0),
         'I(country)Uganda 1' = ifelse(country == "Uganda 1", 1, 0),
         'I(country)Uganda 2' = ifelse(country == "Uganda 2", 1, 0),
         'I(country)USA' = ifelse(country == "USA", 1, 0)) 

data_sim <- sim_data3 %>%
  select('(Intercept)', age, educ_binary2, gender_coded, 'I(country)Colombia':'I(country)USA') %>%
  mutate(across(everything(), ~as.numeric(.)))

# the model we are using
model_logit <- glm(vac_outcome ~ age + educ_binary2 + gender_coded + I(country), 
                   data = df3, 
                   family = binomial(link = 'logit'))

# the simulation
response_sim_educ <- replicate(10, sim_run(model_logit, data_sim), simplify = 'matrix')

# the plot
sim_results <- tibble(point = rowMeans(response_sim_educ)) %>%
  bind_cols(apply(response_sim_educ, 1, quantile, c(0.025, 0.975)) %>% 
              t() %>%  as_tibble()) %>%
  cbind(sim_data3) # with country names

ggplot(data = sim_results, aes(y = point, x = as.factor(educ_binary2), group = as.factor(gender_coded), color= as.factor(gender_coded))) +
  geom_point()+
  geom_errorbar(aes(ymin = `2.5%`, ymax = `97.5%`,color = as.factor(gender_coded)), width = 0.1) +
  ylim(0,1) +
  labs(x = 'Education level', y = 'p', title = 'Estimated probability of answering yes to vaccination question by education') +
  facet_grid(rows = vars(country))

ggsave("plot1.eps")
  
```

Simulation 2: 

+ Varying age - by gender, country, and education level

```{r}

df3$educ_binary2 <- as.factor(df3$educ_binary2)
df3$country <- as.factor(df3$country)
df3$gender_coded <- as.factor(df3$gender_coded)

# age range (17-101),n = 85 ages
age_range <- seq(min(df3$age[!is.na(df3$age)]), max(df3$age[!is.na(df3$age)]), 1)

# Create the dataset, varying education
# get the means age and gender
sim_data <- df3 %>%
  group_by(country, gender_coded) %>%
  summarize() %>%
  filter(!is.na(gender_coded)) %>%
  mutate('(Intercept)' = 1) %>%
  mutate(educ_binary2 = 0) 

# repeat age ranges for the 23 rows in the sim_data
age <- rep(age_range, 23)

# repeat sim_data 85 times and add age
data_frame_mod <- do.call("rbind", replicate(
  85, sim_data, simplify = FALSE)) %>%
  cbind(age = age) 

sim_data2 <- df3 %>%
  group_by(country, gender_coded) %>%
  summarize() %>%
  filter(!is.na(gender_coded)) %>%
  mutate('(Intercept)' = 1) %>%
  mutate(educ_binary2 = 1)

data_frame_mod2 <- do.call("rbind", replicate(
  85, sim_data2, simplify = FALSE)) %>%
  cbind(age = age) 

# dataframe that fits the model output
sim_data3 <- rbind(data_frame_mod, data_frame_mod2) %>%
  ungroup() %>%
  mutate('I(country)Colombia' = ifelse(country == "Colombia", 1, 0),
         'I(country)India' = ifelse(country == "India", 1, 0),
         'I(country)Mozambique' = ifelse(country == "Mozambique", 1, 0),
         'I(country)Pakistan 1' = ifelse(country == "Pakistan 1", 1, 0),
         'I(country)Russia' = ifelse(country == "Russia", 1, 0),
         'I(country)Rwanda' = ifelse(country == "Rwanda", 1, 0),
         'I(country)Sierra Leone 1' = ifelse(country == "Sierra Leone 1", 1, 0),
         'I(country)Sierra Leone 2' = ifelse(country == "Sierra Leone 2", 1, 0),
         'I(country)Uganda 1' = ifelse(country == "Uganda 1", 1, 0),
         'I(country)Uganda 2' = ifelse(country == "Uganda 2", 1, 0),
         'I(country)USA' = ifelse(country == "USA", 1, 0)) 

data_sim <- sim_data3 %>%
  select('(Intercept)', age, educ_binary2, gender_coded, 'I(country)Colombia':'I(country)USA') %>%
  mutate(across(everything(), ~as.numeric(.)))

# the model we are using
model_logit <- glm(vac_outcome ~ age + educ_binary2 + gender_coded + I(country), 
                   data = df3, 
                   family = binomial(link = 'logit'))

# the simulation
response_sim_age <- replicate(5000, sim_run(model_logit, data_sim), simplify = 'matrix')

# the plot
sim_results <- tibble(point = rowMeans(response_sim_age)) %>%
  bind_cols(apply(response_sim_age, 1, quantile, c(0.025, 0.975)) %>% 
              t() %>%  as_tibble()) %>%
  cbind(sim_data3) # with country names

# save simulation results
saveRDS(sim_results, "response_sim_age.rds")

ggplot(data = sim_results %>% filter(educ_binary2 == 0), aes(y = point, x = age, group = as.factor(gender_coded), fill = as.factor(gender_coded), color = as.factor(gender_coded))) +
  geom_smooth(size = 0.5, formula = y ~ x, method = 'loess') +
  geom_ribbon(aes(ymin = `2.5%`, ymax = `97.5%`), alpha = 0.2) +
  facet_wrap(~country) +
  theme_minimal() +
  theme(legend.position = "bottom") +
  scale_fill_discrete(name="Gender",
                      labels=c("Male", "Female")) +
  scale_color_discrete(guide = "none") +
  labs(x = 'Age', y = 'p', title = 'Estimated probability of answering yes to vaccination question \nby country and gender, for < secondary education')

ggsave("plot1.eps")

ggplot(data = sim_results %>% filter(educ_binary2 == 1), aes(y = point, x = age, group = as.factor(gender_coded), fill = as.factor(gender_coded), color = as.factor(gender_coded))) +
  geom_smooth(size = 0.5, formula = y ~ x, method = 'loess') +
  geom_ribbon(aes(ymin = `2.5%`, ymax = `97.5%`), alpha = 0.2) +
  facet_wrap(~country) +
    theme_minimal() +
  theme(legend.position = "bottom") +
  scale_fill_discrete(name="Gender",
                      labels=c("Male", "Female")) +
  scale_color_discrete(guide = "none") +
  labs(x = 'Age', y = 'p', title = 'Estimated probability of answering yes to vaccination question \nby country and gender, for > secondary education')

ggsave("plot2.eps")

```











